knitr::opts_chunk$set(echo = TRUE)
library(usmap)
library(ggplot2)
library(maps)
library(ggsn)
## Loading required package: grid
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::map()    masks maps::map()

Introduction

This provides insight and analysis into Beer and Brewery data across the U.S. provided by Budweiser Executives for strategic decision making.

#library(tidyverse)
beers = read_csv(url("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv"))
## Rows: 2410 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Name, Style
## dbl (5): Beer_ID, ABV, IBU, Brewery_id, Ounces
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
breweries = read_csv(url("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv"))
## Rows: 558 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, City, State
## dbl (1): Brew_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(beers)
##      Name              Beer_ID            ABV               IBU        
##  Length:2410        Min.   :   1.0   Min.   :0.00100   Min.   :  4.00  
##  Class :character   1st Qu.: 808.2   1st Qu.:0.05000   1st Qu.: 21.00  
##  Mode  :character   Median :1453.5   Median :0.05600   Median : 35.00  
##                     Mean   :1431.1   Mean   :0.05977   Mean   : 42.71  
##                     3rd Qu.:2075.8   3rd Qu.:0.06700   3rd Qu.: 64.00  
##                     Max.   :2692.0   Max.   :0.12800   Max.   :138.00  
##                                      NA's   :62        NA's   :1005    
##    Brewery_id       Style               Ounces     
##  Min.   :  1.0   Length:2410        Min.   : 8.40  
##  1st Qu.: 94.0   Class :character   1st Qu.:12.00  
##  Median :206.0   Mode  :character   Median :12.00  
##  Mean   :232.7                      Mean   :13.59  
##  3rd Qu.:367.0                      3rd Qu.:16.00  
##  Max.   :558.0                      Max.   :32.00  
## 
summary(breweries)
##     Brew_ID          Name               City              State          
##  Min.   :  1.0   Length:558         Length:558         Length:558        
##  1st Qu.:140.2   Class :character   Class :character   Class :character  
##  Median :279.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :279.5                                                           
##  3rd Qu.:418.8                                                           
##  Max.   :558.0
head(beers)
## # A tibble: 6 × 7
##   Name                Beer_ID   ABV   IBU Brewery_id Style                Ounces
##   <chr>                 <dbl> <dbl> <dbl>      <dbl> <chr>                 <dbl>
## 1 Pub Beer               1436 0.05     NA        409 American Pale Lager      12
## 2 Devil's Cup            2265 0.066    NA        178 American Pale Ale (…     12
## 3 Rise of the Phoenix    2264 0.071    NA        178 American IPA             12
## 4 Sinister               2263 0.09     NA        178 American Double / I…     12
## 5 Sex and Candy          2262 0.075    NA        178 American IPA             12
## 6 Black Exodus           2261 0.077    NA        178 Oatmeal Stout            12
head(breweries)
## # A tibble: 6 × 4
##   Brew_ID Name                      City          State
##     <dbl> <chr>                     <chr>         <chr>
## 1       1 NorthGate Brewing         Minneapolis   MN   
## 2       2 Against the Grain Brewery Louisville    KY   
## 3       3 Jack's Abby Craft Lagers  Framingham    MA   
## 4       4 Mike Hess Brewing Company San Diego     CA   
## 5       5 Fort Point Beer Company   San Francisco CA   
## 6       6 COAST Brewing Company     Charleston    SC
  1. How many breweries are present in each state? # Breweries by State
#state_data -- Breweries Filtered by State - joined with FIPS code by State
#mutate - Join FIPS data with Breweries data
#trimws - remove Leading/Trailing Whitespace from column(State)
#fips = State by Abbreviation - ie. TX

state_data = breweries %>% count(State) %>% mutate(fips = fips(trimws(State)))

us_map <- usmap::us_map() # used to add map scale

usmap::plot_usmap(data = state_data, values = "n", labels = T)+
  labs(fill = 'Breweries by State') + 
  scale_fill_gradientn(colours=rev(topo.colors(50)),na.value="grey90",
                       guide = guide_colourbar(barwidth = 25, barheight = 0.6,
#put legend title on top of legend
title.position = "top")) +

  # put legend at the bottom, adjust legend title and text font sizes
  theme(legend.position = "bottom",
        legend.title=element_text(size=12), 
        legend.text=element_text(size=10))

  1. Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file. (RMD only, this does not need to be included in the presentation or the deck.)
#Merge beer and breweries data - trim whitespace - add in FIPS codes
full_data = merge(beers,breweries,by.y="Brew_ID",by.x="Brewery_id") %>%
  mutate(fips = fips(trimws(State)))

head(full_data)
##   Brewery_id        Name.x Beer_ID   ABV IBU
## 1          1  Get Together    2692 0.045  50
## 2          1 Maggie's Leap    2691 0.049  26
## 3          1    Wall's End    2690 0.048  19
## 4          1       Pumpion    2689 0.060  38
## 5          1    Stronghold    2688 0.060  25
## 6          1   Parapet ESB    2687 0.056  47
##                                 Style Ounces            Name.y        City
## 1                        American IPA     16 NorthGate Brewing Minneapolis
## 2                  Milk / Sweet Stout     16 NorthGate Brewing Minneapolis
## 3                   English Brown Ale     16 NorthGate Brewing Minneapolis
## 4                         Pumpkin Ale     16 NorthGate Brewing Minneapolis
## 5                     American Porter     16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB)     16 NorthGate Brewing Minneapolis
##   State fips
## 1    MN   27
## 2    MN   27
## 3    MN   27
## 4    MN   27
## 5    MN   27
## 6    MN   27
tail(full_data)
##      Brewery_id                    Name.x Beer_ID   ABV IBU
## 2405        556             Pilsner Ukiah      98 0.055  NA
## 2406        557  Heinnieweisse Weissebier      52 0.049  NA
## 2407        557           Snapperhead IPA      51 0.068  NA
## 2408        557         Moo Thunder Stout      50 0.049  NA
## 2409        557         Porkslap Pale Ale      49 0.043  NA
## 2410        558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                        Name.y          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State fips
## 2405    CA   06
## 2406    NY   36
## 2407    NY   36
## 2408    NY   36
## 2409    NY   36
## 2410    AK   02
full_data[sample(1:nrow(full_data),6),]
##      Brewery_id                  Name.x Beer_ID   ABV IBU
## 1648        323          Good Vibes IPA    1603 0.073  85
## 1623        315          Old Tom Porter     847 0.055  25
## 1625        315 McKinney Eddy Amber Ale     521 0.055  20
## 829         140            Arkansas Red    1650 0.052  NA
## 2209        479    Coffee Oatmeal Stout     311 0.060  54
## 2139        454              Mother Ale    1159 0.056  46
##                         Style Ounces                            Name.y
## 1648             American IPA     12                    Voodoo Brewery
## 1623          American Porter     16       Piney River Brewing Company
## 1625 American Amber / Red Ale     16       Piney River Brewing Company
## 829  American Amber / Red Ale     12 Core Brewing & Distilling Company
## 2209            Oatmeal Stout     12       Good People Brewing Company
## 2139      American Blonde Ale     12            Denali Brewing Company
##            City State fips
## 1648  Meadville    PA   42
## 1623    Bucryus    MO   29
## 1625    Bucryus    MO   29
## 829  Springdale    AR   05
## 2209 Birmingham    AL   01
## 2139  Talkeetna    AK   02
  1. Address the missing values in each column.

Two Datasets, “Beers.csv” and “Breweries.csv” were joined to create a Full Dataset for analysis and insight

Only ABV and IBU are missing Data Values

Important to note: IBU is missing values in around 42% of the 2,410 total rows of data

**Predictions for IBU specifically, may be skewed

#use colSum to sum up the NA's in each column
full_dataNAs <- as.data.frame(colSums(is.na(full_data)))
names(full_dataNAs) <- c("Missing Values (NAs)")
full_dataNAs
##            Missing Values (NAs)
## Brewery_id                    0
## Name.x                        0
## Beer_ID                       0
## ABV                          62
## IBU                        1005
## Style                         5
## Ounces                        0
## Name.y                        0
## City                          0
## State                         0
## fips                          0
summary(full_data)
##    Brewery_id       Name.x             Beer_ID            ABV         
##  Min.   :  1.0   Length:2410        Min.   :   1.0   Min.   :0.00100  
##  1st Qu.: 94.0   Class :character   1st Qu.: 808.2   1st Qu.:0.05000  
##  Median :206.0   Mode  :character   Median :1453.5   Median :0.05600  
##  Mean   :232.7                      Mean   :1431.1   Mean   :0.05977  
##  3rd Qu.:367.0                      3rd Qu.:2075.8   3rd Qu.:0.06700  
##  Max.   :558.0                      Max.   :2692.0   Max.   :0.12800  
##                                                      NA's   :62       
##       IBU            Style               Ounces         Name.y         
##  Min.   :  4.00   Length:2410        Min.   : 8.40   Length:2410       
##  1st Qu.: 21.00   Class :character   1st Qu.:12.00   Class :character  
##  Median : 35.00   Mode  :character   Median :12.00   Mode  :character  
##  Mean   : 42.71                      Mean   :13.59                     
##  3rd Qu.: 64.00                      3rd Qu.:16.00                     
##  Max.   :138.00                      Max.   :32.00                     
##  NA's   :1005                                                          
##      City              State               fips          
##  Length:2410        Length:2410        Length:2410       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
1005/2410 #Calculate percentage missing data from IBU Column
## [1] 0.4170124
  1. Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare. # Alcohol Content, Bitterness by State - BarChart
#Preliminary Steps

#Import Beer and Brewery Data Sets
Beer = read.csv("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv", header = TRUE)
Breweries = read.csv("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv", header = TRUE)

#Updating the name of the first column of the breweries dataset so there is a matching column between the two datasets
names(Breweries)[1] <- 'Brewery_id'

#Merging the two sets together by the Brewery ID column
BB = merge(Beer,Breweries, by="Brewery_id", all = TRUE)
#Updating the mame of the 8th column so that it is Brewery
names(BB)[8] <- 'Brewery'
#Omitting all N/A values from the ABV and IBU columns
x = na.omit(BB[,c(4,10)])
y = na.omit(BB[,c(5,10)])

Alcohol by Volume Comparison by State

#Finding the median ABV of each state
medians = aggregate(x=x$ABV, by = list(x$State), FUN = median)
#Finding the median IBU of each state
mediansIBU = aggregate(x=y$IBU, by = list(y$State), FUN = median)
#Updating column names of the median ABV dataset
names(medians)[1]<- 'State'
names(medians)[2]<- 'ABV'
#Updating column names of the median IBU dataset
names(mediansIBU)[1]<- 'State'
names(mediansIBU)[2]<- 'IBU'

#Plotting the median ABV of each state
p = medians %>%
ggplot(mapping = aes(x=State, y=ABV)) + geom_bar(stat = 'identity', fill = topo.colors(51), width = .5,color = 'black', show.legend = TRUE) + labs(y= "Median ABV", title = "Median ABV of Beers by State") 

ggplotly(p) #Add interactive data labels

Bitterness Comparison by State

#Plotting the median IBU of each state
p = mediansIBU %>%
ggplot(mapping = aes(x=State, y=IBU)) + geom_bar(stat = 'identity', fill = topo.colors(50), width = .5,color = 'black') + labs(y= "Median IBU", title = "Median IBU of Beers by State") 

ggplotly(p) #Add interacitve data labels

5a. Which state has the maximum alcoholic (ABV) beer? #Max ABV = CO

#Finding the highest ABV of all the states
MaxABV = aggregate(x$ABV, by = list(x$State),max)
#Finding the highest IBU of all the states
MaxIBU = aggregate(y$IBU, by = list(y$State),max)
#Updating column names of the max IBU dataset
names(MaxIBU)[1]<- 'State'
names(MaxIBU)[2]<- 'IBU'
#Updating column names of the max ABV dataset
names(MaxABV)[1]<- 'State'
names(MaxABV)[2]<- 'ABV'
#Order the MAX ABV and IBU datasets in descending order
MaxABV = BB[order(BB$ABV, decreasing = TRUE),]
MaxIBU = BB[order(BB$IBU, decreasing = TRUE),]

#Plotting the Max ABV of each state
p = MaxABV %>%
ggplot(mapping = aes(x=State, y=ABV)) + geom_bar(stat = "identity", fill = "darkslateblue") + labs(y="ABV", title = "Highest ABVs per State") 

ggplotly(p)
## Warning: Removed 62 rows containing missing values (position_stack).

5b. Which state has the most bitter (IBU) beer? #Most Bitter Beer = OR

#Plotting the Max IBU of each state
p = MaxIBU %>%
ggplot(mapping = aes(x=State, y=IBU)) + geom_bar(stat = "identity", fill = "chartreuse3", width = .5,color = 'darkslateblue') + labs(y="IBU", title = "Highest IBUs per State") 
ggplotly(p)
## Warning: Removed 1005 rows containing missing values (position_stack).
  1. Comment on the summary statistics and distribution of the ABV variable.
#Budweiser

Beers = read_csv(url("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv"))
## Rows: 2410 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Name, Style
## dbl (5): Beer_ID, ABV, IBU, Brewery_id, Ounces
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Breweries = read_csv(url("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv"))
## Rows: 558 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, City, State
## dbl (1): Brew_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#merge the data before performing EDA
names(Breweries)[1] <- 'Brewery_id'
Breweries
## # A tibble: 558 × 4
##    Brewery_id Name                         City          State
##         <dbl> <chr>                        <chr>         <chr>
##  1          1 NorthGate Brewing            Minneapolis   MN   
##  2          2 Against the Grain Brewery    Louisville    KY   
##  3          3 Jack's Abby Craft Lagers     Framingham    MA   
##  4          4 Mike Hess Brewing Company    San Diego     CA   
##  5          5 Fort Point Beer Company      San Francisco CA   
##  6          6 COAST Brewing Company        Charleston    SC   
##  7          7 Great Divide Brewing Company Denver        CO   
##  8          8 Tapistry Brewing             Bridgman      MI   
##  9          9 Big Lake Brewing             Holland       MI   
## 10         10 The Mitten Brewing Company   Grand Rapids  MI   
## # … with 548 more rows
Beer_Merge <- merge(x=Beers,y=Breweries, by = "Brewery_id", all.x = TRUE)

lookup = data.frame(abb = state.abb, State = state.name) #makes a data frame with State name and abbreviation.

summary(Beer_Merge$ABV) #see summary statistics of mean, median, mode, and quartlies
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00100 0.05000 0.05600 0.05977 0.06700 0.12800      62
#plot the distribution of ABV
Beer_Merge %>%
  ggplot(aes(x = ABV)) + geom_density(binwidth = .5, colour="black",fill="chartreuse3") +
  geom_vline(aes(xintercept=mean(ABV, na.rm=T)),
             color = "red", linetype = "dashed", size = 1) +
  ggtitle("Distribution Density of ABV")
## Warning: Ignoring unknown parameters: binwidth
## Warning: Removed 62 rows containing non-finite values (stat_density).

  1. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.

There appears to a positive correlation between ABV and IBU however, IBU values may be skewed due to missing IBU data - 42% of total missing

Beer_Merge %>%
  ggplot(aes(x = ABV, y = IBU, fill = "sample")) +
  geom_point() + geom_smooth(method =  lm) +
  ggtitle("Relationship Between IBU and ABV")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1005 rows containing non-finite values (stat_smooth).
## Warning: Removed 1005 rows containing missing values (geom_point).

correlation_test = cor.test(Beer_Merge$ABV,Beer_Merge$IBU) #run correlation to determine p value and r-sqaured
correlation_test #showcase statistics
## 
##  Pearson's product-moment correlation
## 
## data:  Beer_Merge$ABV and Beer_Merge$IBU
## t = 33.863, df = 1403, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6407982 0.6984238
## sample estimates:
##       cor 
## 0.6706215
#The scatterplot shows a positive correlation between level of bitterness and alcohol by volume indicating that the higher the bitterness the more alcohol that tends to be contatined within

#Even with a sample of 1005 beers out of 2410 observations in total of IBU, we can conclude that a p value of 2.2e-16 we can say,
#we should reject the null hypothesis given an intercept of 0, which is further proved by the confidence interval
#being .64 to .69 at alpha of .05 or 95%.
#There appears to be a strong relationship with a r-squared correlation of .6706 squared = .45.
#It's a fair to state that when the ABV or IBU is increased the other shall also increase.

##Bonus Insights

#Top 2 States _____#CO | #CA - Top Beer Styles for Top 2 States 

COStyle = full_data %>% filter(fips == "08") %>% select(State,Style) %>% group_by(State) %>% 
  count(Style) %>% slice_max(n,n=5) %>% mutate(fips = fips(trimws(State)))

#Colorado Top Beer Styles BarChart
p = COStyle %>% 
  arrange(desc(COStyle$Style)) %>% 
  mutate(Style = factor(Style)) %>% 
  ggplot(aes(y=Style,x=COStyle$n, fill = COStyle$n)) + geom_bar(stat='identity', show.legend = TRUE, width = .8) +
  labs(fill = 'CO Beer Styles') +
  ggtitle("Top Colorado Beer Styles")

ggplotly(p)
CAStyle = full_data %>% filter(fips == "06") %>% select(State,Style) %>% group_by(State) %>% 
  count(Style) %>% slice_max(n,n=5) %>% mutate(fips = fips(trimws(State)))


#California Top Beer Styles BarChart
p = CAStyle %>% 
  arrange(desc(CAStyle$Style)) %>% 
  mutate(Style = factor(Style)) %>% 
  ggplot(mapping = aes(y=Style,x=CAStyle$n, fill = CAStyle$n)) + geom_bar(stat='identity', show.legend = TRUE, width = .8) +
  labs(fill = 'CA Beer Styles') +
ggtitle("Top California Beer Styles")
  
ggplotly(p)

Ale Classification by Alcohol Content and Bitterness